## Import all data into data frames and recode as necessary
library(tidyverse)
library(tidytext)
library(textstem)
library(hunspell)
library(car)
library(reshape2)
library(stats)
library(devtools)
library(roxygen2)
library(readr)
library(readxl)
library(tidylog)
####IMPORTING DATA SETS
#d<-read.csv("zg_leadership3.csv")
d<-read.csv("data-raw/ehraf_leadership_TOTAL_FINAL_IRR.csv")
d<-d[c(1:1212),]
# Incorporating SCCS label, variables -------------------------------------
load('data-raw/sccs.RData')
sccs = as.data.frame(sccs)
# ID's of SCCS cultures that correspond to HRAF probability sample cultures
sccs.nums=c(19,37,79,172,28,7,140,76,121,109,124,24,87,12,69,181,26,51,149,85,112,125,16,94,138,116,158,57,102,4,34,182,13,127,52,62,165,48,42,36,113,152,100,16,132,98,167,154,21,120)
## adding variables SCCS variables
sccs.vars=c('SOCNAME','V61','V63', 'V64', 'V69','V70', 'V73', 'V76', 'V77', 'V79',
'V80', 'V81', 'V83', 'V84', 'V85', 'V93', 'V94', 'V156', 'V158', 'V158.1',
'V207', 'V208', 'V210', 'V234', 'V235', 'V236', 'V247', 'V270', 'V276',
'V573', 'V574', 'V575', 'V587', 'V666', 'V669', 'V670', 'V679',
'V756', 'V758', 'V759', 'V760', 'V761', 'V762', 'V763', 'V764', 'V765', 'V766', 'V767',
'V768', 'V769', 'V770', 'V773', 'V774', 'V775', 'V777', 'V778', 'V780', 'V785', 'V793', 'V794',
'V795', 'V796', 'V835', 'V836', 'V860', 'V866', 'V867', 'V868', 'V869', 'V902', 'V903', 'V905',
'V907', 'V910', 'V1133', 'V1134', 'V1648', 'V1683', 'V1684', 'V1685')
sccs.2 <- sccs[sccs.nums,sccs.vars]
Culture.codes <- read.delim("data-raw/Culture codes.txt")
Culture.codes$SCCS = as.character(Culture.codes$SCCS)
sccs.2 = merge(sccs.2, Culture.codes, by.x='row.names', by.y='SCCS')
cultures <- read.delim("data-raw/cultures.txt")
cultures = merge(cultures, sccs.2, by='Culture.code', all=T)
rm(sccs.2)
##d uses "c_culture_code", cultures uses "Culture.code" -- only two differences, e.g. two groups in PSF not in SCCS
cultures$c_culture_code<-cultures$Culture.code
## Isolating variables for MA focus
## Prestige/Dominance variables, VV variables, Neel variables (*d3 is main extract level data frame
##ORIGINAL CODE#d3 = d[,c(1:12, 27:35, 51:54, 61:67, 79:80)]
#d3<-d[,c(1:12,27:35,68,69,66,67,74:80,57,58)]
# Rows that are all zero on study variables == 1
# a=0
# for (i in 1:nrow(d3)){a[i] = 1*(sum(abs(d3[i,14:34]))==0)}
# d3$all.zero = a
# d4 = d3[d3$all.zero!=1,]
d$c_cultural_complexity <-as.numeric(d$c_cultural_complexity)
## Recode 0 to -1, and -1 to 0
tmp = as.matrix(d[,14:37])
tmp[tmp==0] = -2
tmp[tmp==-1] = 0
tmp[tmp==-2] = -1
d[,14:37] = tmp
rm(tmp)
##Culture level data
# Try to replicate d.ct in R
# PSF has Bahia Brazilians, but no leadership extracts from that culture, so delete that row
d.ct <-
read.csv('data-raw/culture_fmpro2.csv', stringsAsFactors=F) %>%
filter(c_name != 'Bahia Brazilians ') %>% # Not in d
left_join(as.data.frame(table(d$c_name), stringsAsFactors = F), by=c('c_name'='Var1')) %>%
rename(extract_count = Freq)
# This function calculates the total +1, 0, or -1 for a set of model vars (e.g., neel_vars) for each culture (on d3)
# Set 'vars' to the model variables and val to +1, 0, or -1
# The problem with this is that it assumes d3 and d.ct have both been sorted by c_name
# model_total = function(vars, val){
# as.numeric(by(d3[,vars], d3$c_name, FUN=function(x){sum(x==val, na.rm=T)}))
# }
model_total = function(vars, val, var_name){
x = by(d[,vars], d$c_name, FUN=function(x){sum(x==val, na.rm=T)})
y = data.frame(c_name=names(x), var_name=as.numeric(x))
names(y) = c('c_name', var_name)
return(y)
}
all_totals = function(vars, prefix){
d.ct[paste(prefix, '_total_cell_count', sep='')] = d.ct$extract_count * length(vars)
d.ct = left_join(d.ct, model_total(vars, 1, paste(prefix, '_total_for', sep='')))
d.ct = left_join(d.ct, model_total(vars, 0, paste(prefix, '_total_none', sep='')))
d.ct = left_join(d.ct, model_total(vars, -1, paste(prefix, '_total_against', sep='')))
}
# Neel vars
neel_vars = c('neel_better.mates','neel_big.family', 'neel_intelligence', 'neel_polygynous')
d.ct = all_totals(neel_vars, 'neel')
# Prestige vars
prest_vars = c('prestige_counsel','prestige_emulated', 'prestige_expertise', 'prestige_family', 'prestige_f.exp.success', 'prestige_likable', 'prestige_respected')
d.ct = all_totals(prest_vars, 'prest')
# Dominance vars
dom_vars = c('dom_aggression','dom_assert.authority', 'dom_avoid.dom', 'dom_fear', 'dom_fighting', 'dom_personality', 'dom_reputation', 'dom_strong')
d.ct = all_totals(dom_vars, 'dom')
#Hooper vars
hooper_vars = c('hooper_performance','hooper_sanction.freeriders','hooper_payoff','hooper_group.size','hooper_coop.activities')
d.ct = all_totals(hooper_vars, 'hooper')
# # Van Vugt vars
# vv_vars = c('vanvugt_coordinating', 'vanvugt_strategic')
# d.ct = all_totals('vanvugt_coordinating', 'vvcoord')
# d.ct = all_totals('vanvugt_strategic', 'vvstrat')
all_vars= c(neel_vars, prest_vars, dom_vars, hooper_vars)
## creating cumulative distriubtion table variables
d$evidence_prestige_for = (rowSums(d[,prest_vars]==1))
d$evidence_prestige_against = (rowSums(d[,prest_vars]==-1))
d$evidence_dom_for = (rowSums(d[,dom_vars]==1))
d$evidence_dom_against = (rowSums(d[,dom_vars]==-1))
d$evidence_neel_for = (rowSums(d[,neel_vars]==1))
d$evidence_neel_against = (rowSums(d[,neel_vars]==-1))
d$evidence_hooper_for = (rowSums(d[,hooper_vars]==1))
d$evidence_hooper_against = (rowSums(d[,hooper_vars]==-1))
# d$evidence_vv_for = (rowSums(d[,vv_vars]==1))
# d$evidence_vv_against = (rowSums(d[,vv_vars]==-1))
# # dplyr version (incomplete)
#
# d.ct.new2 = d %>%
# group_by(c_name) %>%
# summarise(
# extract_count = n(),
# neel_total_cell_count = n() * length(neel_vars),
# neel_total_for = sum((neel_better.mates==1) + (neel_big.family==1) + (neel_intelligence==1) + (neel_polygynous==1)),
# neel_total_none = sum((neel_better.mates==0) + (neel_big.family==0) + (neel_intelligence==0) + (neel_polygynous==0)),
# neel_total_against = sum((neel_better.mates==-1) + (neel_big.family==-1) + (neel_intelligence==-1) + (neel_polygynous==-1))
# )
## data set of all important culture level variables (*d.ct is main culture level data with model scores and model level variables)
# d.ct.old <-read.csv("cultures.totals.csv")
# d.ct.old$c_cultural_complexity <-as.numeric(d.ct$c_cultural_complexity)
#
# # Check d.ct against d.ct.old
# sum(d.ct$neel_total_for==d.ct.old$neel_total_for)/nrow(d.ct)
# sum(d.ct$neel_total_against==d.ct.old$neel_total_against)/nrow(d.ct)
#
# sum(d.ct$prest_total_for==d.ct.old$prest_total_for)/nrow(d.ct)
# sum(d.ct$prest_total_against==d.ct.old$prest_total_against)/nrow(d.ct)
#
# sum(d.ct$dom_total_for==d.ct.old$dom_total_for)/nrow(d.ct)
# sum(d.ct$dom_total_against==d.ct.old$dom_total_against)/nrow(d.ct)
## making data set of model totals by subsistence type - include total for, total against for each model (*d.sub is subsistence level, not used in main analyses)
d.sub <-read.csv("data-raw/d.sub_by_model_totals.csv")
##Creating data set of variable list with proportions (*d.prop is list of proportions at model level for extracts)
d.prop <-read.csv("data-raw/d.prop.csv")
##Merging cultures data with d.ct
d.ct=merge(cultures, d.ct, by='c_culture_code', all=T)
####RECODING / MANAGING VARIABLES
## Doing some recoding of subsistence level
d$subsistence2<-car::recode(d$c_subsistence_type, "'forager/food producers'= 'hunter gatherers'; 'foragers'='hunter gatherers';'horticulturalists'='horticulturalists'; 'intensive agriculturalists'='agriculturalists';'other'='other';'pastoral/agriculturalists'='pastoralists';'pastoralists'='pastoralists'")
d.ct$subsistence2<-car::recode(d.ct$c_subsistence_type, "'forager/food producers'= 'hunter gatherers'; 'foragers'='hunter gatherers';'horticulturalists'='horticulturalists'; 'intensive agriculturalists'='agriculturalists';'other'='other';'pastoral/agriculturalists'='pastoralists';'pastoralists'='pastoralists'")
#d.ct.old$subsistence2<-recode(d.ct.old$c_subsistence_type, "'forager/food producers'= 'hunter gatherers'; 'foragers'='hunter gatherers';'horticulturalists'='horticulturalists'; 'intensive agriculturalists'='agriculturalists';'other'='other';'pastoral/agriculturalists'='pastoralists';'pastoralists'='pastoralists'")
d$region2<-car::recode(d$c_subregion, "'Amazon and Orinoco'='South America'; 'Arctic and Subarctic'='North America'; 'Australia'='Insular Pacific'; 'British Isles'='Circum-Mediterranean';
'Central Africa'= 'Africa'; 'Central America'='South America'; 'Central Andes'='South America'; 'East Asia'='East Eurasia'; 'Eastern Africa'='Circum-Mediterranean';
'Eastern South America'='South America'; 'Eastern Woodlands'='North America'; 'Maya Area'='North America'; 'Melanesia'='Insular Pacific'; 'Micronesia'='Insular Pacific';
'Middle East'='Circum-Mediterranean'; 'North Asia'='East Eurasia'; 'Northern Africa'='Africa'; 'Northern Mexico'='North America';
'Northwest Coast and California'='North America'; 'Northwestern South America'='South America'; 'Plains and Plateau'='North America';
'Polynesia'='Insular Pacific'; 'Scandinavia'='Circum-Mediterranean'; 'South Asia'='East Eurasia'; 'Southeast Asia'='Insular Pacific';
'Southeastern Europe'='Circum-Mediterranean'; 'Southern Africa'='Africa'; 'Southern South America'='South America'; 'Southwest and Basin'= 'North America'; 'Western Africa'='Africa'")
d.ct$region2<-car::recode(d.ct$c_subregion, "'Amazon and Orinoco'='South America'; 'Arctic and Subarctic'='North America'; 'Australia'='Insular Pacific'; 'British Isles'='Circum-Mediterranean';
'Central Africa'= 'Africa'; 'Central America'='South America'; 'Central Andes'='South America'; 'East Asia'='East Eurasia'; 'Eastern Africa'='Circum-Mediterranean';
'Eastern South America'='South America'; 'Eastern Woodlands'='North America'; 'Maya Area'='North America'; 'Melanesia'='Insular Pacific'; 'Micronesia'='Insular Pacific';
'Middle East'='Circum-Mediterranean'; 'North Asia'='East Eurasia'; 'Northern Africa'='Africa'; 'Northern Mexico'='North America';
'Northwest Coast and California'='North America'; 'Northwestern South America'='South America'; 'Plains and Plateau'='North America';
'Polynesia'='Insular Pacific'; 'Scandinavia'='Circum-Mediterranean'; 'South Asia'='East Eurasia'; 'Southeast Asia'='Insular Pacific';
'Southeastern Europe'='Circum-Mediterranean'; 'Southern Africa'='Africa'; 'Southern South America'='South America'; 'Southwest and Basin'= 'North America'; 'Western Africa'='Africa'")
d.ct$subsistence2<-as.factor(d.ct$subsistence2)
##above, recoded d$region, according to SCCS region, using eHRAF subregion. BUT, one Southeast Asia (Central Thai) that should be East Eurasia coded as Insular Pacific
#### and 3 Western Africa coded as Africa that should be Circum Medeterainian (Dogon, Hause, Wolof)
###Recoding SCCS variables
d.ct$settlement_fixity2<-car::recode(d.ct$V61, "''='NA'; 'Impermanent-periodically moved'='Impermanent-periodically moved'; 'Migratory'='Migratory'; 'Permanent'='Permanent'; 'Rotating among 2+ fixed'='Impermanent-periodically moved'; 'Seminomadic-fixed then migratory'='Seminomadic'; 'Semisedentary-fixed core, some migratory'='Semisedentary'")
d.ct$com_size2<-car::recode(d.ct$V63, "''='NA'; '< 50'='< 99'; '1,000-4,999'='> 1,000'; '100-199'='100-199'; '200-399'='200-399'; '400-999'='400-999'; '50-99'='< 99'")
d.ct$pop_density2<-car::recode(d.ct$V64, "''='NA'; '< 1 person / 5 sq. mile'='≤ 1 person / 1-5 sq. mile'; '1 person / 1-5 sq. mile'='≤ 1 person / 1-5 sq. mile'; '1-25 persons / sq. mile'='1-25 persons / sq. mile'; '1-5 persons / sq. mile'='1-25 persons / sq. mile'; '101-500 persons / sq. mile'='101-500 persons / sq. mile'; '26-100 persons / sq. mile'='26-100 persons / sq. mile'; 'over 500 persons / sq. mile'='over 500 persons / sq. mile'")
d.ct$V69[d.ct$V69==""]=NA
d.ct$V69=factor(d.ct$V69)
#d.ct$V69<- recode(d.ct$V69, "'NA'='NA'; 'Ambilocal-w/ either wife\'s or husband\'s kin'='Ambilocal'; 'Avunculocal-w/husband\'s mother\'s brother\'s kin'='Avunculocal'; 'Matrilocal or uxorilocal-with wife\'s kin'='Matrilocal'; 'Neolocal-separate from kin'='Neolocal'; 'Patrilocal or virilocal'='Patrilocal'")
##### CREATING AGGREGATE DATA SET
## Summarize support for each variable by culture
# Score 1 if *any* extract within a culture has a 1 for that variable (*d.ct2 is culture level data [single count evidence for] for variable analyses)
# More accurately: balance of +1 against -1
d.ct2 = aggregate(d[,14:37], by=list(d$c_name), FUN = function(x) {(sum(x) > 0)*1})
d.ct2$c_name=d.ct2$Group.1
#creating a dataframe of means values of variables at culture level (*d.ct3 is culture level data [means of variables] for variables )
d.ct3 = aggregate(d[,14:37], by=list(d$c_name), FUN = mean)
##adding in some other culture level variables
#d.ct2 <-merge(d.ct, d.ct2, by="c_name")
## End
#### CREATING DATA FRAMES FOR MODELS ETC. (all extract level data frames for model totals)
## Creating variables based on models totaling for, none, and against
d$dom_for = rowSums(d[,c('dom_aggression', 'dom_fear', 'dom_assert.authority',
'dom_avoid.dom', 'dom_fighting', 'dom_personality',
'dom_reputation', 'dom_strong')]==1)
d$dom_against = rowSums(d[,c('dom_aggression', 'dom_fear', 'dom_assert.authority',
'dom_avoid.dom', 'dom_fighting', 'dom_personality',
'dom_reputation', 'dom_strong')]==-1)
d$dom_none = rowSums(d[,c('dom_aggression', 'dom_fear', 'dom_assert.authority',
'dom_avoid.dom', 'dom_fighting', 'dom_personality',
'dom_reputation', 'dom_strong')]==0)
d$dom_sum = d$dom_for + d$dom_against
d$prestige_for = rowSums(d[,c('prestige_counsel', 'prestige_emulated', 'prestige_expertise',
'prestige_family', 'prestige_f.exp.success', 'prestige_likable',
'prestige_respected')]==1)
d$prestige_against = rowSums(d[,c('prestige_counsel', 'prestige_emulated', 'prestige_expertise',
'prestige_family', 'prestige_f.exp.success', 'prestige_likable',
'prestige_respected')]==-1)
d$prestige_none = rowSums(d[,c('prestige_counsel', 'prestige_emulated', 'prestige_expertise',
'prestige_family', 'prestige_f.exp.success', 'prestige_likable',
'prestige_respected')]==0)
d$prestige_sum= d$prestige_for + d$prestige_against
d$neel_for = rowSums(d[,c('neel_better.mates', 'neel_big.family', 'neel_intelligence',
'neel_polygynous')]==1)
d$neel_against = rowSums(d[,c('neel_better.mates', 'neel_big.family', 'neel_intelligence',
'neel_polygynous')]==-1)
d$neel_none = rowSums(d[,c('neel_better.mates', 'neel_big.family', 'neel_intelligence',
'neel_polygynous')]==0)
d$neel_sum = d$neel_for + d$neel_against
d$hooper_for = rowSums(d[,c('hooper_performance','hooper_sanction.freeriders','hooper_payoff','hooper_group.size','hooper_coop.activities')]==1)
d$hooper_against = rowSums(d[,c('hooper_performance','hooper_sanction.freeriders','hooper_payoff','hooper_group.size','hooper_coop.activities')]==-1)
d$hooper_none = rowSums(d[,c('hooper_performance','hooper_sanction.freeriders','hooper_payoff','hooper_group.size','hooper_coop.activities')]==0)
d$hooper_sum = d$hooper_for + d$hooper_against
#### MERGING Culture level variables with extract level variables (*dm is extract level data, but with culture level variables attached to each extract)
dm <-merge(d, d.ct, by="c_name")
##new data frames by model (no culture name)
d.dom = d[,c(14:21)]
d.prest = d[,c(31:37)]
d.neel = d[,c(27:30)]
d.hooper = d[,c(22:26)]
#d.vv=d[,c(33:34)]
#d.vvstrat = d[,c(34)]
#d.vvcoord = d[,c(33)]
## these codes include culture names, removed from script
#d.dom = d[,c(5, 14:21)]
#d.prest = d[,c(5, 26:32)]
#d.neel = d[,c(5, 22:25)]
#d.vv=d[,c(5, 33:34)]
#d.vvstrat = d[,c(5, 34)]
#d.vvcoord = d[,c(5, 33)]
## making d.dom data set with only extracts that have data, e.g. removes all extracts with all noevidence
d.dom2 = d.dom[rowSums(d.dom!=0)!=0,]
## does not lose any cultures!!
## eliminating empty extracts from prestige model data set
d.prest2 = d.prest[rowSums(d.prest!=0)!=0,]
## does not loose any cultures
## for Neel model
d.neel2 = d.neel[rowSums(d.neel!=0)!=0,]
#does not lose any cultures
## for VV variables (both strat and coord in frame)
#d.vv2 = d.vv[rowSums(d.vv!=0)!=0,]
#does not lose any cultures
##making variables of totals for each model (these omit cases where there is a 1 and -1)
d$dom_totals = d$dom_aggression + d$dom_assert.authority + d$dom_avoid.dom + d$dom_fear + d$dom_fighting + d$dom_personality + d$dom_reputation + d$dom_strong
d$prest_totals = d$prestige_emulated + d$prestige_likable + d$prestige_counsel + d$prestige_expertise + d$prestige_f.exp.success + d$prestige_respected + d$prestige_family
d$neel_totals = d$neel_intelligence + d$neel_polygynous + d$neel_better.mates + d$neel_big.family
d$hooper_totals = d$hooper_performance + d$hooper_group.size + d$hooper_coop.activities + d$hooper_sanction.freeriders + d$hooper_payoff
## putting data in long form
d.tmp=melt(d[,c(1, 14:37)], id.vars='cs_ID')
head(d.tmp)
# table(d.tmp)
# table(d)
table(d.tmp$variable, d.tmp$value)
t=table(d.tmp$variable, d.tmp$value)
t
# Create new variables for variables with many -1 values
# Print tables of all variables to see which have many -1's
for (i in 11:29){
print(names(d)[i])
print(table(d[i]))
}
#Which bits of evidence against to keep?
t2 =t[t[,1]>0,]
t2[(t2[,1]/t2[,3])>=.10,]
leader_text_original <- d
#Cut off rule, keep all evidence against when it's greater than or equal to 10% of the codes for t
# dom_aggression has 14 -1's
d$dom_anti_aggression <- 0
d$dom_anti_aggression[d$dom_aggression == -1] <- 1
d$dom_aggression[d$dom_aggression == -1] <- 0
# dom_assert.authority has 83 -1's
d$dom_no_coercive_authority <- 0
d$dom_no_coercive_authority[d$dom_assert.authority == -1] <- 1
d$dom_assert.authority[d$dom_assert.authority == -1] <- 0
# dom_personality has 23 -1's
d$dom_non_dominant_personality <- 0
d$dom_non_dominant_personality[d$dom_personality == -1] <- 1
d$dom_personality[d$dom_personality == -1] <- 0
# hooper_sanction.freeriders has 3 -1s (to 16 1s)
d$hooper_no_sanctioning <- 0
d$hooper_no_sanctioning[d$hooper_sanction.freeriders == -1] <-1
d$hooper_sanction.freeriders[d$hooper_sanction.freeriders == -1] <- 0
# hooper_group.size has 3 -1s (to 16 1s)
d$hooper_leaderless_large_group <- 0
d$hooper_leaderless_large_group[d$hooper_group.size == -1] <-1
d$hooper_group.size[d$hooper_group.size == -1] <- 0
# hooper_group.size has 3 -1s (to 16 1s)
d$hooper_egalitarian_large_group <- 0
d$hooper_egalitarian_large_group[d$hooper_coop.activities == -1] <-1
d$hooper_coop.activities[d$hooper_coop.activities == -1] <- 0
# prestige_family has 12 -1's
d$prestige_no_family_prestige <- 0
d$prestige_no_family_prestige[d$prestige_family == -1] <- 1
d$prestige_family[d$prestige_family == -1] <- 0
# prestige_likable has 19 -1's
d$prestige_unlikeable <- 0
d$prestige_unlikeable[d$prestige_likable == -1] <- 1
d$prestige_likable[d$prestige_likable == -1] <- 0
# prestige_respected has 28 -1's
d$prestige_not_respected <- 0
d$prestige_not_respected[d$prestige_respected == -1] <- 1
d$prestige_respected[d$prestige_respected == -1] <- 0
#Removing codes of no evidence when cases of no evidence is less than 10%
d$dom_avoid.dom[d$dom_avoid.dom == -1] <- 0
d$dom_fear[d$dom_fear == -1] <- 0
d$dom_fighting[d$dom_fighting == -1] <- 0
d$dom_reputation[d$dom_reputation == -1] <- 0
d$hooper_performance[d$hooper_performance == -1] <- 0
d$hooper_sanction.freeriders[d$hooper_sanction.freeriders == -1] <- 0
d$hooper_payoff[d$hooper_payoff == -1] <- 0
d$hooper_group.size[d$hooper_group.size == -1] <- 0
d$hooper_coop.activities[d$hooper_coop.activities == -1] <- 0
d$neel_intelligence[d$neel_intelligence == -1] <- 0
d$prestige_counsel[d$prestige_counsel == -1] <- 0
d$prestige_emulated[d$prestige_emulated == -1] <- 0
d$prestige_expertise[d$prestige_expertise == -1] <- 0
#Fix emulated coding
d$prestige_emulated[d$cs_textrec_ID==1092] <- 1
#Remove cases of all 0s on theory variables
d<-d[rowSums(d[,c(14:37,73:81)])>0,]
## Compute mean and present variables at culture level
present = function(x) {(sum(x) > 0)*1}
d.ct <- d %>%
dplyr::select(c_name, one_of(all_vars)) %>%
group_by(c_name) %>%
summarise_all(lst(present, mean)) %>%
right_join(d.ct, by='c_name')
# Compute models scores by culture
## Left off here Oct 30, 2019
## Need to add model scores to d.ct, which has 1 extra row (59)
## Possible strategy: create data frame with the 58 rows
## for which we have data, and then merge with d.ct, which has all 59 rows
# Create a data frame of just the 58 cultures first
d_58<-d[!d$c_name=="Saramaka",]
d_58$c_name<-factor(d_58$c_name)
library(data.table)
d_58<-setorder(d_58, c_name)
# Calculate model scores
df_modelscores <-
tibble(
c_name = factor(unique(d_58$c_name)),
neel_cult_score = as.numeric(by(d_58[,c('c_name', neel_vars)], d_58$c_name, FUN=function(x) mean(as.matrix(x[,-1]), na.rm=T))),
prest_cult_score = as.numeric(by(d_58[,c('c_name', prest_vars)], d_58$c_name, FUN=function(x) mean(as.matrix(x[,-1]), na.rm=T))),
dom_cult_score = as.numeric(by(d_58[,c('c_name', dom_vars)], d_58$c_name, FUN=function(x) mean(as.matrix(x[,-1]), na.rm=T))),
hooper_cult_score = as.numeric(by(d_58[,c('c_name', hooper_vars)], d_58$c_name, FUN=function(x) mean(as.matrix(x[,-1]), na.rm=T)))
)
# Merge model scores into d.ct
d.ct <- left_join(d.ct, df_modelscores, by = "c_name")
# d$c_name<-factor(d$c_name)
# d.ct$neel_cult_score = as.numeric(by(d[,c('c_name', neel_vars)], d$c_name, FUN=function(x) mean(as.matrix(x[,-1]), na.rm=T)))
# d.ct$prest_cult_score = as.numeric(by(d[,c('c_name', prest_vars)], d$c_name, FUN=function(x) mean(as.matrix(x[,-1]), na.rm=T)))
# d.ct$dom_cult_score = as.numeric(by(d[,c('c_name', dom_vars)], d$c_name, FUN=function(x) mean(as.matrix(x[,-1]), na.rm=T)))
# d.ct$hooper_cult_score = as.numeric(by(d[,c('c_name', hooper_vars)], d$c_name, FUN=function(x) mean(as.matrix(x[,-1]), na.rm=T)))
#Final renaming
d$subsistence<-d$subsistence2
d$region<-d$region2
d.ct$subsistence<-d.ct$subsistence2
d.ct$region<-d.ct$region2
d.ct$settlement_fixity<-d.ct$settlement_fixity2
d.ct$com_size<-d.ct$com_size2
d.ct$pop_density<-d.ct$pop_density2
d.ct$c_cultural_complexity[d.ct$c_name=='Hopi'] = 35
d.ct$c_cultural_complexity[d.ct$c_name=='Mataco'] = 18
doc_table<-read.csv('data-raw/culture_totals.csv')
doc_table<-doc_table[,c(1,14)]
d.ct<-left_join(d.ct,doc_table)
d_raw.text<-read_csv("data-raw/raw_text_updated.csv")
d_raw.text$raw_text<-d_raw.text$t_text
d_raw.text<-d_raw.text[,c("document_d_ID", "cs_textrec_ID", "t_split_record", "t_split_record_info", "raw_text")]
d_final<-left_join(d,d_raw.text, by="cs_textrec_ID")
# d.ctPKG<-d.ct[,c(1:53,132:167)]
levels(d.ct$subsistence) <-c("agriculturalists", "horticulturalists", "hunter gatherers", "mixed", "pastoralists")
levels(d.ct$subsistence2) <-c("agriculturalists", "horticulturalists", "hunter gatherers", "mixed", "pastoralists")
d.ctPKG <- dplyr::select(d.ct,
c_name,c_culture_code,
Location,
V158.1,
c_cultural_complexity:hooper_total_against,
neel_cult_score:documents
)
leader_cult <- cultures %>%
dplyr::select(c_culture_code, Name, Region) %>%
left_join(d.ctPKG, by = 'c_culture_code')
# Recode cultural char vars as ordinal
leader_cult$com_size <-
ordered(
leader_cult$com_size,
levels = c("< 99", "100-199", "200-399", "400-999", "> 1,000")
)
leader_cult$pop_density <-
ordered(
leader_cult$pop_density,
levels = c(
"1 or less person / 1-5 sq. mile",
"1-5 persons / sq. mile",
"1-25 persons / sq. mile",
"26-100 persons / sq. mile",
"101-500 persons / sq. mile",
"over 500 persons / sq. mile"
)
)
# Create leader_text
leader_text<-dplyr::select(d_final,
cs_ID:evidence_hooper_against,
dom_for:raw_text)
# Rename text record data frame -------------------------------------------
text_records <- d_raw.text
# Words data frame for text analysis -------------------------------
leader_words <-
text_records %>%
dplyr::select(cs_textrec_ID, raw_text) %>%
mutate(
raw_text = iconv(raw_text, "", "UTF-8"),
raw_text = str_replace(raw_text, "’", "'") # Switch ’ to '
) %>%
unnest_tokens(word, raw_text) %>%
mutate(
word = str_to_lower(word),
word = str_replace(word, "'s", "")
) %>%
dplyr::filter(!str_detect(word, "\\d+")) %>%
dplyr::filter(!str_detect(word, 'page')) %>%
anti_join(stop_words) %>%
dplyr::filter(str_count(word) > 2 | word == 'ox') %>%
mutate(lemma = lemmatize_words(word))
# document-term matrix
leader_dtm <-
leader_words %>%
dplyr::select(cs_textrec_ID, lemma) %>%
group_by(cs_textrec_ID, lemma) %>%
summarise(count = n()) %>%
spread(lemma, count, fill = 0) %>%
ungroup
# Two letter words that were removed (and their frequency). Kept "ox"
#
# ac ax er fo fr fu ga ha hò ix la lb mm mo na ne nw oi ot pe ph pt qa ra sa sp
# 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# ta tb ti ué uj ur wi xv ba ea es gu im ko ma ms mu nä ol pa se sh uu xi du ed
# 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3
# iv ki ku oa pl si st vi ya ad bu ge li po en op ri wa al ch dr el ho uh ii
# 3 3 3 3 3 3 3 3 3 4 4 4 4 4 5 5 5 5 6 6 7 7 9 9 13
# ka de pp cf
# 13 14 20 22
# Import documents data frame ---------------------------------------------
documents <-
read_excel("data-raw/documents.xlsx") %>%
rename(
d_publication_date = `d_publication date`
) %>%
mutate(
d_field_date_start = as.numeric(d_field_date_start),
d_field_date_end = as.numeric(d_field_date_end),
d_publication_date = as.numeric(d_publication_date),
# One missing pub date. Replace with d_field_date_end + 5 yrs
d_publication_date = case_when(
is.na(d_publication_date) ~ d_field_date_end + 5,
TRUE ~ d_publication_date
)
) %>%
dplyr::select(d_ID, d_title, d_publication_date, everything())
# Text record coding related to leadership costs, benefits, qualit --------
# qualities, functions, and group structure
leader_text2 <- read_csv("data-raw/reconciled_coding2.csv")
# Reset code sheet IDs for d2 (leader_text2) to avoid any confusion with leader_text coding.
leader_text2$cs_ID<-seq.int(20001,21212)
# Recode -1's
# First, figure out which vars have -1's, and how many
negs <-
leader_text2[-c(1:3)] %>%
select_if(is.numeric) %>%
map_dbl(~sum(.x < 0))
# An how many +1's
pos <-
leader_text2[-c(1:3)] %>%
select_if(is.numeric) %>%
map_dbl(~sum(.x > 0))
# if ratio of negs to pos is > 0.1 create new anti vars
ratio <- negs/pos
ratio[ratio>0.1]
leader_text2$qualities_AntiHonest <-
ifelse(leader_text2$qualities_Honest == -1, 1, 0)
leader_text2$qualities_AntiFairness <-
ifelse(leader_text2$qualities_Fairness == -1, 1, 0)
leader_text2$qualities_AntiDrugConsumption <-
ifelse(leader_text2$qualities_DrugConsumption == -1, 1, 0)
leader_text2$qualities_AntiCoerciveAuthority <-
ifelse(leader_text2$qualities_CoerciveAuthority == -1, 1, 0)
leader_text2_original <- leader_text2
leader_text2 <-
map_if(leader_text2_original, is.numeric, ~ ifelse(.x < 0, 0, .x)) %>%
as_tibble
# Compare to negs, pos
negs2 <-
leader_text2[-c(1:3)] %>%
select_if(is.numeric) %>%
map_dbl(~sum(.x < 0))
pos2 <-
leader_text2[-c(1:3)] %>%
select_if(is.numeric) %>%
map_dbl(~sum(.x > 0))
# Import author and authorship data ---------------------------------------
authorship <- read_excel("data-raw/authorship.xlsx")
# Recode variables --------------------------------------------------------
# Add sex
leader_text_original$demo_sex <- as.character(leader_text_original$demo_sex)
leader_text_original$demo_sex[leader_text_original$demo_sex == "-1"] <- "male"
leader_text_original$demo_sex[leader_text_original$demo_sex == "unkown"] <- "unknown"
leader_text2 <- left_join(leader_text2, leader_text_original[c("cs_textrec_ID", "demo_sex")])
# Aggregate group structure types
group_str <- c(
"state-level group" = "political group (supracommunity)",
"religeous group" = "religious group", # correct spelling
"political group (community)" = "political group (community)",
"political group (supracommunity)" = "political group (supracommunity)",
"military group" = "military group",
"economic group" = "economic group",
"criminal group" = "economic group",
"labor group" = "economic group",
"subsistence group" = "economic group",
"age-group" = "residential subgroup",
"domestic group" = "kin group",
"kin group" = "kin group",
"local group" = "residential subgroup",
"performance group" = "residential subgroup",
"other" = "other",
"multiple domains" = "other",
"unkown" = "other"
)
leader_text2$group.structure2 <- leader_text2$group.structure_Coded
leader_text2$group.structure2 <- group_str[leader_text2$group.structure2]
# Female coauthor ---------------------------------------------------------
female_coauthor <- function(document_ID){
author_genders <- authorship$author_gender[authorship$document_ID == document_ID]
'female' %in% author_genders
}
documents$female_coauthor <- map_lgl(documents$d_ID, female_coauthor)
# Merge more culture-level data -------------------------------------------
# number of text records for each culture
tbl_ntr <- table(leader_text_original$c_culture_code)
dfntr <- tibble(
"OWC Code" = names(tbl_ntr),
number_leader_records = as.numeric(tbl_ntr)
)
# get number of pages for PSF cultures only
dfwc <-
read_excel("data-raw/WC_SummaryPublishedCollections_20180625.xlsx") %>%
dplyr::filter(`PSF eHRAF` == 'Yes') %>%
left_join(dfntr) %>%
dplyr::select(`OWC Code`, `N Pages eHRAF`, number_leader_records)
leader_cult <-
leader_cult %>%
left_join(dfwc, by = c('c_culture_code' = 'OWC Code')) %>%
rename(
ehraf_pages = `N Pages eHRAF`
)
# Convert factors to chars ------------------------------------------------
documents <- mutate_if(documents, is.factor, as.character)
authorship <- mutate_if(authorship, is.factor, as.character)
leader_text <- mutate_if(leader_text, is.factor, as.character)
leader_cult <- mutate_if(leader_cult, is.factor, as.character)
leader_text_original <- mutate_if(leader_text_original, is.factor, as.character)
text_records <- mutate_if(text_records, is.factor, as.character)
leader_text2 <- mutate_if(leader_text2, is.factor, as.character)
# Nice var names ----------------------------------------------------------
var_names <- c(
"functions_BestowMate" = "Bestow mates",
"functions_PoliticalAppointments" = "Political appointments",
"functions_ConstructionInfrastructure" = "Construction/infastructure",
"functions_ControlEconomics" = "Control economics",
"functions_CouncilMember" = "Council member",
"functions_GroupDetermination" = "Group determination/cohesiveness",
"functions_Hospitality" = "Hospitality",
"functions_MilitaryCommand" = "Military command",
"functions_NewSettlement" = "Movement/migration",
"functions_ProsocialInvestment" = "Prosocial investment",
"functions_ProvideCounsel" = "Provide counsel/direction",
"functions_Punishment" = "Punishment",
"functions_ServeLeader" = "Serve a leader",
"functions_StrategicPlanning" = "Strategic planning",
"functions_OrganizeCooperation" = "Organize cooperation",
"functions_ResolveConflcit" = "Resolve conflict",
"functions_ControlCalendar" = "Control calendar",
"functions_ControlImmigration" = "Control immigration",
"functions_DistributeResources" = "Distribute resources",
"functions_GroupRepresentative" = "Group representative",
"functions_Medicinal" = "Medicinal functions",
"functions_MoralAuthority" = "Moral authority",
"functions_Policymaking" = "Policy making",
"functions_Protection" = "Protection",
"functions_ProvideSubsistence" = "Provide subsistence",
"functions_Ritual" = "Ritual functions",
"functions_SocialFunctions" = "Misc. social functions",
"qualities_ArtisticPerformance" = "Artistic performance",
"qualities_Generous" = "Generosity",
"qualities_Age" = "Age",
"qualities_Attractive" = "Attractive",
"qualities_CoerciveAuthority" = "Coercive authority",
"qualities_CulturallyProgressive" = "Culturally progressive",
"qualities_FavorablePersonality" = "Favorable personality",
"qualities_Honest" = "Honesty",
"qualities_IngroupMember" = "Ingroup member",
"qualities_Killer" = "Killer",
"qualities_ManyChildren" = "Many children",
"qualities_PhysicallyStrong" = "Physically formidable",
"qualities_Prosocial" = "Prosocial",
"qualities_StrategicPlanner" = "Strategic planner",
"qualities_DrugConsumption" = "Drug consumption",
"qualities_HighStatus" = "High status",
"qualities_Aggressive" = "Aggressiveness",
"qualities_Bravery" = "Bravery",
"qualities_Confident" = "Confidence",
"qualities_Decisive" = "Decisiveness/decision-making",
"qualities_Feared" = "Feared",
"qualities_Humble" = "Humility",
"qualities_Innovative" = "Innovative",
"qualities_KnowlageableIntellect" = "Knowledgeable/intelligent",
"qualities_OratorySkill" = "Oratory skill",
"qualities_Polygynous" = "Polygynous",
"qualities_SocialContacts" = "Social contacts",
"qualities_Supernatural" = "Supernatural",
"qualities_ExpAccomplished" = "Experienced/accomplished",
"qualities_Wealthy" = "Wealthy",
"qualities_Ambition" = "Ambitious",
"qualities_Charisma" = "Charisma",
"qualities_CulturallyConservative" = "Culturally conservative",
"qualities_Fairness" = "Fairness",
"qualities_HighQualitySpouse" = "High-quality spouse",
"qualities_Industriousness" = "Industriousness",
"qualities_InterpersonalSkills" = "Interpersonal skills",
"qualities_Loyalty" = "Loyalty",
"qualities_PhysicalHealth" = "Physical health",
"qualities_ProperBehavior" = "Proper behavior",
"qualities_StrategicNepotism" = "Strategic nepotism",
"qualities_Xenophobic" = "Xenophobia",
"qualities_AntiHonest" = "Dishonest",
"qualities_AntiFairness" = "Unfair",
"qualities_AntiDrugConsumption" = "No drug consumption",
"qualities_AntiCoerciveAuthority" = "No coercive authority",
leader.benefits_Fitness = 'Inclusive fitness benefit',
leader.benefits_Mating = 'Mating benefit',
leader.benefits_Other = 'Misc. non-material benefit',
leader.benefits_RiskHarmConflict = 'Reduced risk of harm',
leader.benefits_ResourceFood = 'Food benefit',
leader.benefits_ResourceOther = 'Material resources',
leader.benefits_SocialServices = 'Social services',
leader.benefits_SocialStatusReputation = 'Increased social status',
leader.benefits_Territory = 'Territory',
follower.benefits_Fitness = 'Inclusive fitness benefit',
follower.benefits_Mating = 'Mating benefit',
follower.benefits_Other = 'Misc. non-material benefit',
follower.benefits_RiskHarmConflict = 'Reduced risk of harm',
follower.benefits_ResourceFood = 'Food benefit',
follower.benefits_ResourceOther = 'Material resources',
follower.benefits_SocialServices = 'Social services',
follower.benefits_SocialStatusReputation = 'Increased social status',
follower.benefits_Territory = 'Territory',
leader.costs_Fitness = 'Inclusive fitness cost',
leader.costs_RiskHarmConflict = 'Increased risk of harm',
leader.costs_Other = 'Misc. non-material cost',
leader.costs_ResourceFood = 'Food cost',
leader.costs_ResourceOther = 'Loss of material resources',
leader.costs_SocialStatusReputation = 'Reduced social status',
leader.costs_Territory = 'Loss of territory',
leader.costs_Mating = 'Mating cost',
leader.costs_SocialServices = 'Loss of social services',
follower.costs_Fitness = 'Inclusive fitness cost',
follower.costs_RiskHarmConflict = 'Increased risk of harm',
follower.costs_Mating = 'Mating cost',
follower.costs_Other = 'Misc. non-material cost',
follower.costs_ResourceFood = 'Food cost',
follower.costs_ResourceOther = 'Loss of material resources',
follower.costs_SocialServices = 'Loss of social services',
follower.costs_SocialStatusReputation = 'Reduced social status',
follower.costs_Territory = 'Loss of territory'
)
# Write data --------------------------------------------------------------
use_data(documents, authorship, leader_text, leader_text2_original, leader_text2, leader_cult, leader_text_original, text_records, leader_words, leader_dtm, var_names, overwrite=TRUE)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.